# Load the cleaned gapminder data as a tibble.
# The CSV's first (row-index) column has no header, so readr names it `...1`.
gapminder <- as_tibble(read_csv("gapminder_clean.csv"))
## New names:
## Rows: 2607 Columns: 20
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (2): Country Name, continent dbl (18): ...1, Year, Agriculture, value added (%
## of GDP), CO2 emissions (me...
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
# Subset the 1962 rows and draw CO2 emissions (x) against GDP per capita (y).
gapminder1962 <- filter(gapminder, Year == 1962)
ggplot(
  data = gapminder1962,
  mapping = aes(
    x = `CO2 emissions (metric tons per capita)`,
    y = gdpPercap
  )
) +
  geom_point()
## Warning: Removed 151 rows containing missing values (geom_point).
# Pearson correlation test between CO2 emissions and GDP per capita in 1962.
# cor.test() has no `use` argument (that belongs to cor()); it drops
# incomplete pairs on its own, so the former `use = "complete.obs"` was
# silently ignored and has been removed.
gapminder1962_cor <- cor.test(
  gapminder1962$`CO2 emissions (metric tons per capita)`,
  gapminder1962$gdpPercap
)
# p-value to 3 significant figures, correlation estimate to 2 decimals.
gapminder1962_p <- signif(gapminder1962_cor$p.value, 3)
gapminder1962_c <- round(gapminder1962_cor$estimate, 2)
# Text fragments used to report the result.
pearson_output <- paste0("Pearson correlation: ", gapminder1962_c, "\n")
pvalue_output <- paste0("P-value: ", gapminder1962_p)
Pearson correlation: 0.93
P-value: 1.13e-46
# Pearson correlation between CO2 emissions and GDP per capita for every year
# present in the data. Each year's estimate (rounded to 3 decimals) is printed
# and collected in `coors`, in the same order as `years`.
years <- unique(gapminder$Year)
coors <- vapply(
  years,
  function(y) {
    year_rows <- filter(gapminder, Year == y)
    # cor.test() drops incomplete pairs itself; it has no `use` argument.
    year_cor <- cor.test(
      year_rows$`CO2 emissions (metric tons per capita)`,
      year_rows$gdpPercap
    )
    estimate <- round(year_cor$estimate, 3)
    cat(paste0(y, " pearson correlation: ", estimate, "\n"))
    estimate
  },
  numeric(1)
)
## 1962 pearson correlation: 0.926
## 1967 pearson correlation: 0.939
## 1972 pearson correlation: 0.843
## 1977 pearson correlation: 0.793
## 1982 pearson correlation: 0.817
## 1987 pearson correlation: 0.81
## 1992 pearson correlation: 0.809
## 1997 pearson correlation: 0.808
## 2002 pearson correlation: 0.801
## 2007 pearson correlation: 0.72
# Year with the highest correlation estimate. which.max() returns a single
# index (the first maximum) and skips NA values, so `max_year` is one year
# even on ties, unlike a floating-point `==` comparison against max(coors).
max_year <- years[which.max(coors)]
max_year_output <- paste0("\n", "Correlation is strongest in ", max_year)
Correlation is strongest in 1967
# Interactive scatter plot for the year with the strongest correlation:
# points are sized by `pop` and coloured by `continent`.
strongest_year_data <- filter(gapminder, Year == max_year)
plot1 <- ggplot(
  strongest_year_data,
  aes(
    x = `CO2 emissions (metric tons per capita)`,
    y = gdpPercap,
    size = pop,
    color = continent
  )
) +
  geom_point()
ggplotly(plot1)
# Keep rows with a known continent and a non-missing energy-use value.
# The column is renamed first so the filter can refer to it as a bare name:
# a quoted string inside is.na() is a character constant (never NA), so
# filtering on it would leave every row in place.
gapminder2 <- gapminder %>%
  rename(energy = `Energy use (kg of oil equivalent per capita)`) %>%
  filter(!is.na(energy), !is.na(continent))
# Run a one-way ANOVA of energy use by continent and keep its summary table.
gapminder_aov <- summary(
  aov(energy ~ continent, data = gapminder2, na.action = na.omit)
)
# Row 1, column 5 of the first summary table is the value reported
# downstream as the p-value.
gapminder_aovP <- gapminder_aov[[1]][1, 5]
# Box plot of energy use for each continent.
ggplot(data = gapminder2, mapping = aes(x = continent, y = energy)) +
  geom_boxplot() +
  labs(y = "Energy use (kg of oil equivalent per capita)")
## Warning: Removed 436 rows containing non-finite values (stat_boxplot).
# Answer text: summary sentence followed by the ANOVA p-value.
energy_output <- paste(
  "A: There are big differences in energy use across the 5 continents. This can be seen in the boxplots and is highly significant by ANOV. p-value:",
  gapminder_aovP,
  sep = " "
)
A: There are big differences in energy use across the 5 continents. This can be seen in the boxplots and is highly significant by ANOV. p-value: 8.52700348715528e-39
# Compare imports (% of GDP) between Asia and Europe for years after 1990.
gapminder1990 <- filter(
  gapminder,
  Year > 1990,
  continent %in% c("Asia", "Europe")
)
# Two-sample t-test with t.test() defaults: Asia first, Europe second.
test_output <- t.test(
  filter(gapminder1990, continent == "Asia")$`Imports of goods and services (% of GDP)`,
  filter(gapminder1990, continent == "Europe")$`Imports of goods and services (% of GDP)`
)
# Box plot of the same two groups.
ggplot(
  data = gapminder1990,
  mapping = aes(x = continent, y = `Imports of goods and services (% of GDP)`)
) +
  geom_boxplot()
## Warning: Removed 12 rows containing non-finite values (stat_boxplot).
# Answer text: summary sentence followed by the t-test p-value.
output <- paste(
  "A: There is not a significant difference between the two, as seen in the boxplot and quantified by t.test. p-value:",
  test_output$p.value,
  sep = " "
)
A: There is not a significant difference between the two, as seen in the boxplot and quantified by t.test. p-value: 0.177569118980769
### Average population density for each country ###
# summarise() replaces the superseded summarise_at(vars(), list()) form and
# yields the same single `avg_density` column.
# NOTE(review): mean() is called without na.rm, so a country with any missing
# density value gets an NA average. This matches the original behaviour;
# confirm whether na.rm = TRUE is wanted.
gapminder_countries <- gapminder %>%
  group_by(`Country Name`) %>%
  summarise(
    avg_density = mean(`Population density (people per sq. km of land area)`),
    .groups = "drop"
  ) %>%
  arrange(desc(avg_density))
# Print the first rows, i.e. the highest averages.
head(gapminder_countries)
## # A tibble: 6 × 2
## `Country Name` avg_density
## <chr> <dbl>
## 1 Macao SAR, China 14732.
## 2 Monaco 14090.
## 3 Hong Kong SAR, China 5153.
## 4 Singapore 4361.
## 5 Gibraltar 2622.
## 6 Bermuda 1133.
# Interactive bar chart of the average density per country, with the x-axis
# tick labels hidden.
plot1 <- gapminder_countries %>%
  ggplot(aes(x = `Country Name`, y = avg_density)) +
  geom_col() +
  theme(axis.text.x = element_blank())
ggplotly(plot1)
## Warning: Removed 9 rows containing missing values (position_stack).
### Rank countries by density within each year, then average the ranks ###
# A grouped mutate ranks every year in one pass instead of growing a data
# frame with rbind() inside a loop. Rows with a missing Year are dropped
# first because the original per-year filter never matched them.
# Negating the density makes rank 1 the densest country of the year.
gapminder_countries_rank <- gapminder %>%
  filter(!is.na(Year)) %>%
  group_by(Year) %>%
  mutate(
    density_rank = rank(-`Population density (people per sq. km of land area)`)
  ) %>%
  # Regroup by country (replaces the Year grouping) and average the ranks.
  group_by(`Country Name`) %>%
  summarise(avg_density_rank = mean(density_rank), .groups = "drop") %>%
  arrange(avg_density_rank)
# Print the first rows, i.e. the best (lowest) average ranks.
head(gapminder_countries_rank)
## # A tibble: 6 × 2
## `Country Name` avg_density_rank
## <chr> <dbl>
## 1 Macao SAR, China 1.5
## 2 Monaco 1.5
## 3 Hong Kong SAR, China 3.1
## 4 Singapore 3.9
## 5 Gibraltar 5
## 6 Bermuda 6.2
# Answer text: the country with the highest average density, plus the two
# countries at the top of the average-rank table.
top_density_country <- gapminder_countries$`Country Name`[1]
top_ranked_countries <- gapminder_countries_rank$`Country Name`[1:2]
output <- paste0(
  "A: The highest average across all years is ",
  top_density_country,
  ". However, the highest average RANKING (rank 1-263, averaged across the 10 years of data, if they exist for the country) is a tie between these two countries: ",
  top_ranked_countries[1],
  " and ",
  top_ranked_countries[2]
)
A: The highest average across all years is Macao SAR, China. However, the highest average RANKING (rank 1-263, averaged across the 10 years of data, if they exist for the country) is a tie between these two countries: Macao SAR, China and Monaco
# Change in life expectancy relative to each country's 1962 value.
# `countries` lists the country names in order of first appearance;
# `gapminder1962` holds the 1962 rows that have a life-expectancy value.
countries <- unique(gapminder$`Country Name`)
gapminder1962 <- filter(
  gapminder,
  Year == 1962,
  !is.na(`Life expectancy at birth, total (years)`)
)
# For every row, find the first 1962 baseline row of the same country.
# match() gives NA when a country has no baseline, which makes the
# difference NA for that country.
baseline_row <- match(gapminder$`Country Name`, gapminder1962$`Country Name`)
# One vectorised subtraction replaces the per-country rbind() loop. Rows
# without a country name are dropped and rows are ordered by country (first
# appearance), which is the layout the loop used to produce.
gapminder_exp <- gapminder %>%
  mutate(
    Life_expectancy_vs_1962 = `Life expectancy at birth, total (years)` -
      gapminder1962$`Life expectancy at birth, total (years)`[baseline_row]
  ) %>%
  filter(!is.na(`Country Name`)) %>%
  arrange(match(`Country Name`, countries))
# Interactive line chart: one line per country showing the change in life
# expectancy relative to 1962 across years.
plot1 <- ggplot(
  gapminder_exp,
  aes(x = Year, y = Life_expectancy_vs_1962, color = `Country Name`)
) +
  geom_line()
ggplotly(plot1)
# Sort so the largest gain over the 1962 baseline comes first, then report
# the country in that first row.
gapminder_exp_top <- arrange(gapminder_exp, desc(Life_expectancy_vs_1962))
top_gain_country <- gapminder_exp_top$`Country Name`[1]
output <- paste(
  top_gain_country,
  "has shown the greatest increase in life expectancy."
)
Maldives has shown the greatest increase in life expectancy.